Load libraries

library(lubridate)

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union

Read in data set

# Read the raw CSV and normalise its column names with janitor.
avocado <- janitor::clean_names(read_csv("data/avocado.csv"))
New names:
• `` -> `...1`
Rows: 18249 Columns: 14
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (2): type, region
dbl  (11): ...1, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, year
date  (1): Date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Use glimpse

# Compact column-by-column preview of the data.
avocado %>%
  glimpse()
Rows: 18,249
Columns: 14
$ x1            <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,…
$ date          <date> 2015-12-27, 2015-12-20, 2015-12-13, 2015-12-06, 2015-11-29, 2015-11-22, 2015-11-15, 2015-11-08, 2015-11-01, …
$ average_price <dbl> 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.12, 1.28, 1.31, 0.99, 1.33, 1.28, 1.11, 1.07, 1…
$ total_volume  <dbl> 64236.62, 54876.98, 118220.22, 78992.15, 51039.60, 55979.78, 83453.76, 109428.33, 99811.42, 74338.76, 84843.4…
$ x4046         <dbl> 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 703.75, 1022.15, 842.40, 924.86, 1582.03, 2268.32…
$ x4225         <dbl> 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 73672.72, 101815.36, 87315.57, 64757.44, 75595.8…
$ x4770         <dbl> 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34, 113.00, 117.07, 105.32, 101.36, 154.84, 150.5…
$ total_bags    <dbl> 8696.87, 9505.56, 8145.35, 5811.16, 6183.95, 6683.91, 8318.86, 6829.22, 11388.36, 8625.92, 8205.66, 10123.90,…
$ small_bags    <dbl> 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81, 6266.85, 11104.53, 8061.47, 7877.86, 9866.27, …
$ large_bags    <dbl> 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, 283.83, 564.45, 327.80, 257.63, 376.77, 145.59,…
$ x_large_bags  <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0…
$ type          <chr> "conventional", "conventional", "conventional", "conventional", "conventional", "conventional", "conventional…
$ year          <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2…
$ region        <chr> "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany", "Albany",…

Use skim

# skimr supplies skim(), used immediately below.
library(skimr)

# Print skimr's summary of the avocado data frame.
skim(avocado)
── Data Summary ────────────────────────
                           Values 
Name                       avocado
Number of rows             18249  
Number of columns          14     
_______________________           
Column type frequency:            
  character                2      
  Date                     1      
  numeric                  11     
________________________          
Group variables            None   

Look at first six rows

# Show the first six rows.
head(avocado)

Look at distribution of average prices

# Histogram of average_price using ggplot2's default binning.
ggplot(avocado, aes(x = average_price)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Check whether any predictors are aliased (linearly dependent)

# Run alias() on a model of average_price against every other column to look
# for linearly dependent (aliased) terms.
alias(average_price ~ .,
      data = avocado)
Model :
average_price ~ x1 + date + total_volume + x4046 + x4225 + x4770 + 
    total_bags + small_bags + large_bags + x_large_bags + type + 
    year + region

Find all the distinct regions

# List each unique value of the region column.
distinct(avocado, region)

Feature Engineering

# Feature engineering on the cleaned avocado data:
#   * month        - full month name taken from `date`
#   * total_single - sum of the three PLU columns, placed after total_volume
#   * year         - converted to a factor
#   * region       - market names collapsed into broader regions
#   * is_organic   - TRUE when type is "organic"
#
# case_when() uses the FIRST branch whose condition is TRUE, and str_detect()
# matches substrings. A short pattern therefore also matches any longer name
# that contains it, so the more specific pattern must come first. In
# particular "WestTexNewMexico" has to be tested before "West", otherwise it
# is labelled "West" and its own branch can never be reached.
#
# NOTE(review): the patterns include aggregate-looking names ("TotalUS",
# "West", "Northeast", ...) alongside city names - confirm that keeping both
# kinds of row is intended.
avocado <- avocado %>%
  mutate(month = month(date, label = TRUE, abbr = FALSE)) %>%
  # Kept as its own mutate() so that .after only repositions total_single.
  mutate(total_single = x4046 + x4225 + x4770,
         .after = total_volume) %>%
  mutate(year = as.factor(year)) %>%
  mutate(region = case_when(
    str_detect(region, "Albany") ~ "Northeast",
    str_detect(region, "Atlanta") ~ "Southeast",
    str_detect(region, "BaltimoreWashington") ~ "Northeast",
    str_detect(region, "Boise") ~ "West",
    str_detect(region, "Boston") ~ "Northeast",
    str_detect(region, "BuffaloRochester") ~ "Northeast",
    str_detect(region, "California") ~ "California",
    str_detect(region, "Charlotte") ~ "Midsouth",
    str_detect(region, "Chicago") ~ "Great Lakes",
    str_detect(region, "CincinnatiDayton") ~ "Great Lakes",
    str_detect(region, "Columbus") ~ "Great Lakes",
    str_detect(region, "DallasFtWorth") ~ "South Central",
    str_detect(region, "Denver") ~ "West",
    str_detect(region, "Detroit") ~ "Great Lakes",
    str_detect(region, "GrandRapids") ~ "Great Lakes",
    str_detect(region, "GreatLakes") ~ "Great Lakes",
    str_detect(region, "HarrisburgScranton") ~ "Northeast",
    str_detect(region, "Houston") ~ "South Central",
    str_detect(region, "HartfordSpringfield") ~ "Northeast",
    str_detect(region, "Indianapolis") ~ "Great Lakes",
    str_detect(region, "Jacksonville") ~ "Southeast",
    str_detect(region, "LasVegas") ~ "West",
    str_detect(region, "LosAngeles") ~ "California",
    str_detect(region, "Louisville") ~ "Midsouth",
    str_detect(region, "MiamiFtLauderdale") ~ "Southeast",
    str_detect(region, "Midsouth") ~ "Midsouth",
    str_detect(region, "Nashville") ~ "Midsouth",
    str_detect(region, "NewOrleansMobile") ~ "South Central",
    str_detect(region, "NewYork") ~ "Northeast",
    str_detect(region, "Northeast") ~ "Northeast",
    str_detect(region, "NorthernNewEngland") ~ "Northeast",
    str_detect(region, "Orlando") ~ "Southeast",
    str_detect(region, "Philadelphia") ~ "Northeast",
    str_detect(region, "PhoenixTucson") ~ "West",
    str_detect(region, "Pittsburgh") ~ "Northeast",
    str_detect(region, "Plains") ~ "Plains",
    str_detect(region, "Portland") ~ "West",
    str_detect(region, "RaleighGreensboro") ~ "Midsouth",
    str_detect(region, "RichmondNorfolk") ~ "Midsouth",
    str_detect(region, "Roanoke") ~ "Midsouth",
    str_detect(region, "Sacramento") ~ "California",
    str_detect(region, "SanDiego") ~ "California",
    str_detect(region, "SanFrancisco") ~ "California",
    str_detect(region, "Seattle") ~ "West",
    str_detect(region, "SouthCarolina") ~ "Southeast",
    str_detect(region, "SouthCentral") ~ "South Central",
    str_detect(region, "Southeast") ~ "Southeast",
    str_detect(region, "Spokane") ~ "West",
    str_detect(region, "StLouis") ~ "Plains",
    str_detect(region, "Syracuse") ~ "Northeast",
    str_detect(region, "Tampa") ~ "Southeast",
    str_detect(region, "TotalUS") ~ "TotalUS",
    # Must precede the "West" branch: "West" is a substring of this name.
    str_detect(region, "WestTexNewMexico") ~ "South Central",
    str_detect(region, "West") ~ "West",
    # Anything unmatched keeps its original label.
    TRUE ~ region
    )) %>%
  mutate(is_organic = type == "organic")

Feature engineering (continued)

# Keep only the response and the candidate predictor columns.
avocado <- select(
  avocado,
  average_price,
  total_single,
  total_bags,
  is_organic,
  year,
  region,
  month
)

Further feature engineering:

Split the data into test/train data sets (90/10)

# Print n_row. It is assigned in code not shown in this excerpt; presumably it
# is the row count of avocado used for the train/test split - TODO confirm.
n_row
[1] 18249

Initial ggpairs

# Pairs plot of the training data with average_price moved to the first column.
ggpairs(select(avocado_train, average_price, everything()))

 plot: [1,1] [>-----------------------------------------------------------------------------------------------------]  1% est: 0s 
 plot: [1,2] [=>----------------------------------------------------------------------------------------------------]  2% est: 4s 
 plot: [1,3] [==>---------------------------------------------------------------------------------------------------]  3% est: 4s 
 plot: [1,4] [===>--------------------------------------------------------------------------------------------------]  4% est: 4s 
 plot: [1,5] [====>-------------------------------------------------------------------------------------------------]  5% est: 4s 
 plot: [1,6] [=====>------------------------------------------------------------------------------------------------]  6% est: 5s 
 plot: [1,7] [======>-----------------------------------------------------------------------------------------------]  7% est: 6s 
 plot: [1,8] [=======>----------------------------------------------------------------------------------------------]  8% est: 6s 
 plot: [1,9] [========>---------------------------------------------------------------------------------------------]  9% est: 6s 
 plot: [1,10] [=========>-------------------------------------------------------------------------------------------] 10% est: 6s 
 plot: [2,1] [==========>-------------------------------------------------------------------------------------------] 11% est: 6s 
 plot: [2,2] [===========>------------------------------------------------------------------------------------------] 12% est: 6s 
 plot: [2,3] [============>-----------------------------------------------------------------------------------------] 13% est: 6s 
 plot: [2,4] [=============>----------------------------------------------------------------------------------------] 14% est: 5s 
 plot: [2,5] [==============>---------------------------------------------------------------------------------------] 15% est: 5s 
 plot: [2,6] [===============>--------------------------------------------------------------------------------------] 16% est: 5s 
 plot: [2,7] [================>-------------------------------------------------------------------------------------] 17% est: 5s 
 plot: [2,8] [=================>------------------------------------------------------------------------------------] 18% est: 5s 
 plot: [2,9] [==================>-----------------------------------------------------------------------------------] 19% est: 5s 
 plot: [2,10] [===================>---------------------------------------------------------------------------------] 20% est: 5s 
 plot: [3,1] [====================>---------------------------------------------------------------------------------] 21% est: 5s 
 plot: [3,2] [=====================>--------------------------------------------------------------------------------] 22% est: 5s 
 plot: [3,3] [======================>-------------------------------------------------------------------------------] 23% est: 5s 
 plot: [3,4] [=======================>------------------------------------------------------------------------------] 24% est: 5s 
 plot: [3,5] [=========================>----------------------------------------------------------------------------] 25% est: 4s 
 plot: [3,6] [==========================>---------------------------------------------------------------------------] 26% est: 5s 
 plot: [3,7] [===========================>--------------------------------------------------------------------------] 27% est: 5s 
 plot: [3,8] [============================>-------------------------------------------------------------------------] 28% est: 5s 
 plot: [3,9] [=============================>------------------------------------------------------------------------] 29% est: 5s 
 plot: [3,10] [=============================>-----------------------------------------------------------------------] 30% est: 4s 
 plot: [4,1] [===============================>----------------------------------------------------------------------] 31% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,2] [================================>---------------------------------------------------------------------] 32% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,3] [=================================>--------------------------------------------------------------------] 33% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,4] [==================================>-------------------------------------------------------------------] 34% est: 4s 
 plot: [4,5] [===================================>------------------------------------------------------------------] 35% est: 4s 
 plot: [4,6] [====================================>-----------------------------------------------------------------] 36% est: 4s 
 plot: [4,7] [=====================================>----------------------------------------------------------------] 37% est: 4s 
 plot: [4,8] [======================================>---------------------------------------------------------------] 38% est: 4s 
 plot: [4,9] [=======================================>--------------------------------------------------------------] 39% est: 4s 
 plot: [4,10] [=======================================>-------------------------------------------------------------] 40% est: 4s 
 plot: [5,1] [=========================================>------------------------------------------------------------] 41% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,2] [==========================================>-----------------------------------------------------------] 42% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,3] [===========================================>----------------------------------------------------------] 43% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,4] [============================================>---------------------------------------------------------] 44% est: 4s 
 plot: [5,5] [=============================================>--------------------------------------------------------] 45% est: 4s 
 plot: [5,6] [==============================================>-------------------------------------------------------] 46% est: 4s 
 plot: [5,7] [===============================================>------------------------------------------------------] 47% est: 4s 
 plot: [5,8] [================================================>-----------------------------------------------------] 48% est: 4s 
 plot: [5,9] [=================================================>----------------------------------------------------] 49% est: 4s 
 plot: [5,10] [=================================================>---------------------------------------------------] 50% est: 4s 
 plot: [6,1] [===================================================>--------------------------------------------------] 51% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,2] [====================================================>-------------------------------------------------] 52% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,3] [=====================================================>------------------------------------------------] 53% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,4] [======================================================>-----------------------------------------------] 54% est: 4s 
 plot: [6,5] [=======================================================>----------------------------------------------] 55% est: 4s 
 plot: [6,6] [========================================================>---------------------------------------------] 56% est: 4s 
 plot: [6,7] [=========================================================>--------------------------------------------] 57% est: 4s 
 plot: [6,8] [==========================================================>-------------------------------------------] 58% est: 4s 
 plot: [6,9] [===========================================================>------------------------------------------] 59% est: 4s 
 plot: [6,10] [============================================================>----------------------------------------] 60% est: 4s 
 plot: [7,1] [=============================================================>----------------------------------------] 61% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,2] [==============================================================>---------------------------------------] 62% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,3] [===============================================================>--------------------------------------] 63% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,4] [================================================================>-------------------------------------] 64% est: 4s 
 plot: [7,5] [=================================================================>------------------------------------] 65% est: 4s 
 plot: [7,6] [==================================================================>-----------------------------------] 66% est: 4s 
 plot: [7,7] [===================================================================>----------------------------------] 67% est: 4s 
 plot: [7,8] [====================================================================>---------------------------------] 68% est: 4s 
 plot: [7,9] [=====================================================================>--------------------------------] 69% est: 3s 
 plot: [7,10] [======================================================================>------------------------------] 70% est: 3s 
 plot: [8,1] [=======================================================================>------------------------------] 71% est: 3s 
 plot: [8,2] [========================================================================>-----------------------------] 72% est: 3s 
 plot: [8,3] [=========================================================================>----------------------------] 73% est: 3s 
 plot: [8,4] [==========================================================================>---------------------------] 74% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,5] [===========================================================================>--------------------------] 75% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,6] [=============================================================================>------------------------] 76% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,7] [==============================================================================>-----------------------] 77% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,8] [===============================================================================>----------------------] 78% est: 3s 
 plot: [8,9] [================================================================================>---------------------] 79% est: 3s 
 plot: [8,10] [================================================================================>--------------------] 80% est: 2s 
 plot: [9,1] [==================================================================================>-------------------] 81% est: 2s 
 plot: [9,2] [===================================================================================>------------------] 82% est: 2s 
 plot: [9,3] [====================================================================================>-----------------] 83% est: 2s 
 plot: [9,4] [=====================================================================================>----------------] 84% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,5] [======================================================================================>---------------] 85% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,6] [=======================================================================================>--------------] 86% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,7] [========================================================================================>-------------] 87% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,8] [=========================================================================================>------------] 88% est: 1s 
 plot: [9,9] [==========================================================================================>-----------] 89% est: 1s 
 plot: [9,10] [==========================================================================================>----------] 90% est: 1s 
 plot: [10,1] [===========================================================================================>---------] 91% est: 1s 
 plot: [10,2] [============================================================================================>--------] 92% est: 1s 
 plot: [10,3] [=============================================================================================>-------] 93% est: 1s 
 plot: [10,4] [==============================================================================================>------] 94% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [10,5] [===============================================================================================>-----] 95% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [10,6] [================================================================================================>----] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [10,7] [=================================================================================================>---] 97% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [10,8] [==================================================================================================>--] 98% est: 0s 
 plot: [10,9] [===================================================================================================>-] 99% est: 0s 
 plot: [10,10] [====================================================================================================]100% est: 0s 
                                                                                                                                  

Based on this, look at the following for the first predictor:

Plot ln_total_bags histogram

# Histogram of ln_total_bags in the training data (default binning).
ggplot(avocado_train, aes(x = ln_total_bags)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot is_organic boxplot

# Boxplot of average_price for each value of is_organic.
ggplot(avocado_train, aes(x = is_organic, y = average_price)) +
  geom_boxplot()

Create first model

# Candidate first model: average_price regressed on ln_total_bags alone.
mod1a <- lm(formula = average_price ~ ln_total_bags, data = avocado_train)

# Diagnostic plots for the fit.
autoplot(mod1a)

# Coefficient table and fit statistics.
summary(mod1a)

Call:
lm(formula = average_price ~ ln_total_bags, data = avocado_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.0395 -0.2346 -0.0377  0.1976  1.6295 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    2.448530   0.011805  207.41   <2e-16 ***
ln_total_bags -0.102457   0.001131  -90.57   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3276 on 16423 degrees of freedom
Multiple R-squared:  0.3331,    Adjusted R-squared:  0.3331 
F-statistic:  8203 on 1 and 16423 DF,  p-value: < 2.2e-16

Results:

# Candidate first model: average_price regressed on is_organic alone.
mod1b <- lm(formula = average_price ~ is_organic, data = avocado_train)

# Diagnostic plots for the fit.
autoplot(mod1b)

# Coefficient table and fit statistics.
summary(mod1b)

Call:
lm(formula = average_price ~ is_organic, data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.21288 -0.19668 -0.02668  0.18332  1.59712 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    1.156682   0.003478   332.6   <2e-16 ***
is_organicTRUE 0.496196   0.004920   100.9   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3152 on 16423 degrees of freedom
Multiple R-squared:  0.3825,    Adjusted R-squared:  0.3825 
F-statistic: 1.017e+04 on 1 and 16423 DF,  p-value: < 2.2e-16

Results:

Look at and plot residuals:

# Attach mod1b's residuals to the training data, then drop the response and
# the predictor that mod1b already uses.
avocado_resid <- select(
  add_residuals(avocado_train, mod1b),
  -c(average_price, is_organic)
)

# Pairs plot of the residuals (moved to the first column) against the
# remaining columns.
ggpairs(select(avocado_resid, resid, everything()))

 plot: [1,1] [>-----------------------------------------------------------------------------------------------------]  1% est: 0s 
 plot: [1,2] [==>---------------------------------------------------------------------------------------------------]  2% est: 3s 
 plot: [1,3] [===>--------------------------------------------------------------------------------------------------]  4% est: 3s 
 plot: [1,4] [====>-------------------------------------------------------------------------------------------------]  5% est: 3s 
 plot: [1,5] [=====>------------------------------------------------------------------------------------------------]  6% est: 3s 
 plot: [1,6] [=======>----------------------------------------------------------------------------------------------]  7% est: 4s 
 plot: [1,7] [========>---------------------------------------------------------------------------------------------]  9% est: 5s 
 plot: [1,8] [=========>--------------------------------------------------------------------------------------------] 10% est: 4s 
 plot: [1,9] [==========>-------------------------------------------------------------------------------------------] 11% est: 4s 
 plot: [2,1] [============>-----------------------------------------------------------------------------------------] 12% est: 4s 
 plot: [2,2] [=============>----------------------------------------------------------------------------------------] 14% est: 4s 
 plot: [2,3] [==============>---------------------------------------------------------------------------------------] 15% est: 4s 
 plot: [2,4] [===============>--------------------------------------------------------------------------------------] 16% est: 4s 
 plot: [2,5] [=================>------------------------------------------------------------------------------------] 17% est: 4s 
 plot: [2,6] [==================>-----------------------------------------------------------------------------------] 19% est: 4s 
 plot: [2,7] [===================>----------------------------------------------------------------------------------] 20% est: 4s 
 plot: [2,8] [====================>---------------------------------------------------------------------------------] 21% est: 4s 
 plot: [2,9] [======================>-------------------------------------------------------------------------------] 22% est: 4s 
 plot: [3,1] [=======================>------------------------------------------------------------------------------] 23% est: 4s 
 plot: [3,2] [========================>-----------------------------------------------------------------------------] 25% est: 4s 
 plot: [3,3] [=========================>----------------------------------------------------------------------------] 26% est: 4s 
 plot: [3,4] [===========================>--------------------------------------------------------------------------] 27% est: 3s 
 plot: [3,5] [============================>-------------------------------------------------------------------------] 28% est: 3s 
 plot: [3,6] [=============================>------------------------------------------------------------------------] 30% est: 3s 
 plot: [3,7] [==============================>-----------------------------------------------------------------------] 31% est: 3s 
 plot: [3,8] [================================>---------------------------------------------------------------------] 32% est: 3s 
 plot: [3,9] [=================================>--------------------------------------------------------------------] 33% est: 3s 
 plot: [4,1] [==================================>-------------------------------------------------------------------] 35% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,2] [====================================>-----------------------------------------------------------------] 36% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,3] [=====================================>----------------------------------------------------------------] 37% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,4] [======================================>---------------------------------------------------------------] 38% est: 4s 
 plot: [4,5] [=======================================>--------------------------------------------------------------] 40% est: 3s 
 plot: [4,6] [=========================================>------------------------------------------------------------] 41% est: 3s 
 plot: [4,7] [==========================================>-----------------------------------------------------------] 42% est: 3s 
 plot: [4,8] [===========================================>----------------------------------------------------------] 43% est: 3s 
 plot: [4,9] [============================================>---------------------------------------------------------] 44% est: 3s 
 plot: [5,1] [==============================================>-------------------------------------------------------] 46% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,2] [===============================================>------------------------------------------------------] 47% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,3] [================================================>-----------------------------------------------------] 48% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,4] [=================================================>----------------------------------------------------] 49% est: 4s 
 plot: [5,5] [===================================================>--------------------------------------------------] 51% est: 4s 
 plot: [5,6] [====================================================>-------------------------------------------------] 52% est: 3s 
 plot: [5,7] [=====================================================>------------------------------------------------] 53% est: 3s 
 plot: [5,8] [======================================================>-----------------------------------------------] 54% est: 3s 
 plot: [5,9] [========================================================>---------------------------------------------] 56% est: 3s 
 plot: [6,1] [=========================================================>--------------------------------------------] 57% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,2] [==========================================================>-------------------------------------------] 58% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,3] [===========================================================>------------------------------------------] 59% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,4] [=============================================================>----------------------------------------] 60% est: 3s 
 plot: [6,5] [==============================================================>---------------------------------------] 62% est: 3s 
 plot: [6,6] [===============================================================>--------------------------------------] 63% est: 3s 
 plot: [6,7] [================================================================>-------------------------------------] 64% est: 3s 
 plot: [6,8] [==================================================================>-----------------------------------] 65% est: 3s 
 plot: [6,9] [===================================================================>----------------------------------] 67% est: 3s 
 plot: [7,1] [====================================================================>---------------------------------] 68% est: 3s 
 plot: [7,2] [======================================================================>-------------------------------] 69% est: 3s 
 plot: [7,3] [=======================================================================>------------------------------] 70% est: 3s 
 plot: [7,4] [========================================================================>-----------------------------] 72% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,5] [=========================================================================>----------------------------] 73% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,6] [===========================================================================>--------------------------] 74% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,7] [============================================================================>-------------------------] 75% est: 2s 
 plot: [7,8] [=============================================================================>------------------------] 77% est: 2s 
 plot: [7,9] [==============================================================================>-----------------------] 78% est: 2s 
 plot: [8,1] [================================================================================>---------------------] 79% est: 2s 
 plot: [8,2] [=================================================================================>--------------------] 80% est: 2s 
 plot: [8,3] [==================================================================================>-------------------] 81% est: 2s 
 plot: [8,4] [===================================================================================>------------------] 83% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,5] [=====================================================================================>----------------] 84% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,6] [======================================================================================>---------------] 85% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,7] [=======================================================================================>--------------] 86% est: 1s 
 plot: [8,8] [========================================================================================>-------------] 88% est: 1s 
 plot: [8,9] [==========================================================================================>-----------] 89% est: 1s 
 plot: [9,1] [===========================================================================================>----------] 90% est: 1s 
 plot: [9,2] [============================================================================================>---------] 91% est: 1s 
 plot: [9,3] [=============================================================================================>--------] 93% est: 1s 
 plot: [9,4] [===============================================================================================>------] 94% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,5] [================================================================================================>-----] 95% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,6] [=================================================================================================>----] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [9,7] [==================================================================================================>---] 98% est: 0s 
 plot: [9,8] [====================================================================================================>-] 99% est: 0s 
 plot: [9,9] [======================================================================================================]100% est: 0s 
                                                                                                                                  

Based on this, look at the following for the second predictor:

Plot region boxplot

# Boxplot of average_price for each region in the training data
ggplot(avocado_train, aes(x = region, y = average_price)) +
  geom_boxplot()

Plot the month boxplot

# Boxplot of average_price for each month in the training data
ggplot(avocado_train, aes(x = month, y = average_price)) +
  geom_boxplot()

# Two-predictor candidate (a): is_organic plus ln_total_bags
mod2a <- lm(
  formula = average_price ~ is_organic + ln_total_bags,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod2a)

# Coefficient table, residual standard error and R-squared
summary(mod2a)

Call:
lm(formula = average_price ~ is_organic + ln_total_bags, data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.09530 -0.20351 -0.02322  0.18008  1.57438 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     1.727684   0.018422   93.79   <2e-16 ***
is_organicTRUE  0.338227   0.006923   48.86   <2e-16 ***
ln_total_bags  -0.048296   0.001532  -31.53   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3061 on 16422 degrees of freedom
Multiple R-squared:  0.4178,    Adjusted R-squared:  0.4177 
F-statistic:  5891 on 2 and 16422 DF,  p-value: < 2.2e-16

Results:

# Two-predictor candidate (b): is_organic plus region
mod2b <- lm(
  formula = average_price ~ is_organic + region,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod2b)

# Coefficient table, residual standard error and R-squared
summary(mod2b)

Call:
lm(formula = average_price ~ is_organic + region, data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.15982 -0.18145 -0.02334  0.15423  1.51673 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          1.236815   0.007845 157.655  < 2e-16 ***
is_organicTRUE       0.496453   0.004571 108.612  < 2e-16 ***
regionGreat Lakes   -0.133450   0.009831 -13.575  < 2e-16 ***
regionMidsouth      -0.113476   0.009827 -11.547  < 2e-16 ***
regionNortheast      0.078179   0.008930   8.755  < 2e-16 ***
regionPlains        -0.052484   0.014043  -3.737 0.000187 ***
regionSouth Central -0.353895   0.011244 -31.475  < 2e-16 ***
regionSoutheast     -0.057496   0.009842  -5.842 5.27e-09 ***
regionTotalUS       -0.163394   0.018349  -8.905  < 2e-16 ***
regionWest          -0.162423   0.009373 -17.330  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2929 on 16415 degrees of freedom
Multiple R-squared:  0.4672,    Adjusted R-squared:  0.4669 
F-statistic:  1599 on 9 and 16415 DF,  p-value: < 2.2e-16

Results:

# Two-predictor candidate (c): is_organic plus month.
# NOTE(review): month appears to be an ordered factor -- the summary reports
# polynomial-contrast terms (month.L, month.Q, month.C, month^4, ...).
mod2c <- lm(
  formula = average_price ~ is_organic + month,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod2c)

# Coefficient table, residual standard error and R-squared
summary(mod2c)

Call:
lm(formula = average_price ~ is_organic + month, data = avocado_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1497 -0.1956 -0.0206  0.1868  1.5494 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     1.164277   0.003319 350.812  < 2e-16 ***
is_organicTRUE  0.496363   0.004676 106.144  < 2e-16 ***
month.L         0.212495   0.007952  26.721  < 2e-16 ***
month.Q        -0.152996   0.008071 -18.957  < 2e-16 ***
month.C        -0.189055   0.008083 -23.391  < 2e-16 ***
month^4        -0.073337   0.008020  -9.145  < 2e-16 ***
month^5        -0.006010   0.008131  -0.739 0.459796    
month^6         0.060764   0.008248   7.367 1.83e-13 ***
month^7        -0.001815   0.008065  -0.225 0.821930    
month^8         0.031413   0.008105   3.876 0.000107 ***
month^9         0.014378   0.008340   1.724 0.084731 .  
month^10       -0.025341   0.008375  -3.026 0.002485 ** 
month^11        0.012087   0.008410   1.437 0.150667    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2996 on 16412 degrees of freedom
Multiple R-squared:  0.4426,    Adjusted R-squared:  0.4421 
F-statistic:  1086 on 12 and 16412 DF,  p-value: < 2.2e-16

Results:

Plot the residuals

# Attach the residuals of mod2b to the training data, then drop the response
# and the predictors already in the model so that only the remaining
# candidate variables (plus resid) are left.
avocado_resid <- avocado_train %>%
  add_residuals(mod2b) %>%
  select(-c(average_price, is_organic, region))

# Pairs plot with resid moved to the first column, to see which remaining
# variable is still related to the unexplained variation.
# progress = FALSE stops ggpairs() from writing one progress-bar line per
# panel into the rendered output; the plot itself is unchanged.
avocado_resid %>%
  select(resid, everything()) %>%
  ggpairs(progress = FALSE)

 plot: [1,1] [=>----------------------------------------------------------------------------------------------------]  2% est: 0s 
 plot: [1,2] [==>---------------------------------------------------------------------------------------------------]  3% est: 2s 
 plot: [1,3] [====>-------------------------------------------------------------------------------------------------]  5% est: 3s 
 plot: [1,4] [=====>------------------------------------------------------------------------------------------------]  6% est: 3s 
 plot: [1,5] [=======>----------------------------------------------------------------------------------------------]  8% est: 4s 
 plot: [1,6] [=========>--------------------------------------------------------------------------------------------]  9% est: 5s 
 plot: [1,7] [==========>-------------------------------------------------------------------------------------------] 11% est: 4s 
 plot: [1,8] [============>-----------------------------------------------------------------------------------------] 12% est: 4s 
 plot: [2,1] [=============>----------------------------------------------------------------------------------------] 14% est: 4s 
 plot: [2,2] [===============>--------------------------------------------------------------------------------------] 16% est: 4s 
 plot: [2,3] [=================>------------------------------------------------------------------------------------] 17% est: 4s 
 plot: [2,4] [==================>-----------------------------------------------------------------------------------] 19% est: 3s 
 plot: [2,5] [====================>---------------------------------------------------------------------------------] 20% est: 4s 
 plot: [2,6] [=====================>--------------------------------------------------------------------------------] 22% est: 4s 
 plot: [2,7] [=======================>------------------------------------------------------------------------------] 23% est: 4s 
 plot: [2,8] [=========================>----------------------------------------------------------------------------] 25% est: 3s 
 plot: [3,1] [==========================>---------------------------------------------------------------------------] 27% est: 3s 
 plot: [3,2] [============================>-------------------------------------------------------------------------] 28% est: 3s 
 plot: [3,3] [=============================>------------------------------------------------------------------------] 30% est: 3s 
 plot: [3,4] [===============================>----------------------------------------------------------------------] 31% est: 3s 
 plot: [3,5] [================================>---------------------------------------------------------------------] 33% est: 3s 
 plot: [3,6] [==================================>-------------------------------------------------------------------] 34% est: 3s 
 plot: [3,7] [====================================>-----------------------------------------------------------------] 36% est: 3s 
 plot: [3,8] [=====================================>----------------------------------------------------------------] 38% est: 3s 
 plot: [4,1] [=======================================>--------------------------------------------------------------] 39% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,2] [========================================>-------------------------------------------------------------] 41% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,3] [==========================================>-----------------------------------------------------------] 42% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,4] [============================================>---------------------------------------------------------] 44% est: 3s 
 plot: [4,5] [=============================================>--------------------------------------------------------] 45% est: 3s 
 plot: [4,6] [===============================================>------------------------------------------------------] 47% est: 3s 
 plot: [4,7] [================================================>-----------------------------------------------------] 48% est: 3s 
 plot: [4,8] [==================================================>---------------------------------------------------] 50% est: 2s 
 plot: [5,1] [====================================================>-------------------------------------------------] 52% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,2] [=====================================================>------------------------------------------------] 53% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,3] [=======================================================>----------------------------------------------] 55% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,4] [========================================================>---------------------------------------------] 56% est: 3s 
 plot: [5,5] [==========================================================>-------------------------------------------] 58% est: 3s 
 plot: [5,6] [============================================================>-----------------------------------------] 59% est: 3s 
 plot: [5,7] [=============================================================>----------------------------------------] 61% est: 3s 
 plot: [5,8] [===============================================================>--------------------------------------] 62% est: 3s Warning in ifelse(x >= 0, x, max + 1 + x) :
  restarting interrupted promise evaluation

 plot: [6,1] [================================================================>-------------------------------------] 64% est: 2s 
 plot: [6,2] [==================================================================>-----------------------------------] 66% est: 2s 
 plot: [6,3] [====================================================================>---------------------------------] 67% est: 2s 
 plot: [6,4] [=====================================================================>--------------------------------] 69% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,5] [=======================================================================>------------------------------] 70% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,6] [========================================================================>-----------------------------] 72% est: 2s 
 plot: [6,7] [==========================================================================>---------------------------] 73% est: 2s 
 plot: [6,8] [===========================================================================>--------------------------] 75% est: 2s 
 plot: [7,1] [=============================================================================>------------------------] 77% est: 2s 
 plot: [7,2] [===============================================================================>----------------------] 78% est: 2s 
 plot: [7,3] [================================================================================>---------------------] 80% est: 1s 
 plot: [7,4] [==================================================================================>-------------------] 81% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,5] [===================================================================================>------------------] 83% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,6] [=====================================================================================>----------------] 84% est: 1s 
 plot: [7,7] [=======================================================================================>--------------] 86% est: 1s 
 plot: [7,8] [========================================================================================>-------------] 88% est: 1s 
 plot: [8,1] [==========================================================================================>-----------] 89% est: 1s 
 plot: [8,2] [===========================================================================================>----------] 91% est: 1s 
 plot: [8,3] [=============================================================================================>--------] 92% est: 1s 
 plot: [8,4] [===============================================================================================>------] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,5] [================================================================================================>-----] 95% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [8,6] [==================================================================================================>---] 97% est: 0s 
 plot: [8,7] [===================================================================================================>--] 98% est: 0s 
 plot: [8,8] [======================================================================================================]100% est: 0s 
                                                                                                                                  

Based on this, look at the following for the third predictor:

# Three-predictor candidate (a): is_organic + region plus prop_single_bags
mod3a <- lm(
  formula = average_price ~ is_organic + region + prop_single_bags,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod3a)

# Coefficient table, residual standard error and R-squared
summary(mod3a)

Call:
lm(formula = average_price ~ is_organic + region + prop_single_bags, 
    data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.01881 -0.18403 -0.02364  0.15186  1.49736 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          0.959189   0.012358  77.615  < 2e-16 ***
is_organicTRUE       0.566053   0.005081 111.408  < 2e-16 ***
regionGreat Lakes   -0.090874   0.009709  -9.360  < 2e-16 ***
regionMidsouth      -0.059627   0.009774  -6.101 1.08e-09 ***
regionNortheast      0.163889   0.009215  17.785  < 2e-16 ***
regionPlains         0.006730   0.013861   0.486    0.627    
regionSouth Central -0.291893   0.011185 -26.096  < 2e-16 ***
regionSoutheast      0.007834   0.009873   0.793    0.428    
regionTotalUS       -0.115107   0.017988  -6.399 1.60e-10 ***
regionWest          -0.110474   0.009326 -11.846  < 2e-16 ***
prop_single_bags     0.314894   0.011003  28.619  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2859 on 16414 degrees of freedom
Multiple R-squared:  0.4925,    Adjusted R-squared:  0.4922 
F-statistic:  1593 on 10 and 16414 DF,  p-value: < 2.2e-16

Results:

# Three-predictor candidate (b): is_organic + region plus month
mod3b <- lm(
  formula = average_price ~ is_organic + region + month,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod3b)

# Coefficient table, residual standard error and R-squared
summary(mod3b)

Call:
lm(formula = average_price ~ is_organic + region + month, data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.11408 -0.16736 -0.01074  0.15318  1.52588 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          1.244203   0.007400 168.138  < 2e-16 ***
is_organicTRUE       0.496619   0.004309 115.257  < 2e-16 ***
regionGreat Lakes   -0.133220   0.009266 -14.377  < 2e-16 ***
regionMidsouth      -0.114344   0.009263 -12.345  < 2e-16 ***
regionNortheast      0.078579   0.008417   9.336  < 2e-16 ***
regionPlains        -0.051701   0.013236  -3.906 9.42e-05 ***
regionSouth Central -0.352245   0.010598 -33.236  < 2e-16 ***
regionSoutheast     -0.057636   0.009277  -6.213 5.34e-10 ***
regionTotalUS       -0.164598   0.017296  -9.517  < 2e-16 ***
regionWest          -0.161862   0.008835 -18.321  < 2e-16 ***
month.L              0.211679   0.007327  28.889  < 2e-16 ***
month.Q             -0.153842   0.007436 -20.688  < 2e-16 ***
month.C             -0.188233   0.007447 -25.275  < 2e-16 ***
month^4             -0.073374   0.007389  -9.930  < 2e-16 ***
month^5             -0.007427   0.007492  -0.991  0.32152    
month^6              0.060358   0.007600   7.942 2.12e-15 ***
month^7             -0.003000   0.007431  -0.404  0.68644    
month^8              0.029506   0.007468   3.951 7.82e-05 ***
month^9              0.014368   0.007684   1.870  0.06153 .  
month^10            -0.024322   0.007717  -3.152  0.00163 ** 
month^11             0.012001   0.007749   1.549  0.12147    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2761 on 16404 degrees of freedom
Multiple R-squared:  0.527, Adjusted R-squared:  0.5264 
F-statistic: 913.7 on 20 and 16404 DF,  p-value: < 2.2e-16
# Compare the nested fits: mod3b is mod2b (is_organic + region) with month
# added, so this tests whether the extra month terms improve the model.
anova(mod2b, mod3b)
Analysis of Variance Table

Model 1: average_price ~ is_organic + region
Model 2: average_price ~ is_organic + region + month
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1  16415 1408.2                                  
2  16404 1250.3 11    157.96 188.41 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Results:

Plot the residuals

# Attach the residuals of mod3b to the training data, then drop the response
# and the predictors already in the model so that only the remaining
# candidate variables (plus resid) are left.
avocado_resid <- avocado_train %>%
  add_residuals(mod3b) %>%
  select(-c(average_price, is_organic, region, month))

# Pairs plot with resid moved to the first column, to see which remaining
# variable is still related to the unexplained variation.
# progress = FALSE stops ggpairs() from writing one progress-bar line per
# panel into the rendered output; the plot itself is unchanged.
avocado_resid %>%
  select(resid, everything()) %>%
  ggpairs(progress = FALSE)

 plot: [1,1] [=>----------------------------------------------------------------------------------------------------]  2% est: 0s 
 plot: [1,2] [===>--------------------------------------------------------------------------------------------------]  4% est: 2s 
 plot: [1,3] [=====>------------------------------------------------------------------------------------------------]  6% est: 2s 
 plot: [1,4] [=======>----------------------------------------------------------------------------------------------]  8% est: 2s 
 plot: [1,5] [=========>--------------------------------------------------------------------------------------------] 10% est: 2s 
 plot: [1,6] [===========>------------------------------------------------------------------------------------------] 12% est: 2s 
 plot: [1,7] [==============>---------------------------------------------------------------------------------------] 14% est: 2s 
 plot: [2,1] [================>-------------------------------------------------------------------------------------] 16% est: 2s 
 plot: [2,2] [==================>-----------------------------------------------------------------------------------] 18% est: 2s 
 plot: [2,3] [====================>---------------------------------------------------------------------------------] 20% est: 2s 
 plot: [2,4] [======================>-------------------------------------------------------------------------------] 22% est: 2s 
 plot: [2,5] [========================>-----------------------------------------------------------------------------] 24% est: 2s 
 plot: [2,6] [==========================>---------------------------------------------------------------------------] 27% est: 2s 
 plot: [2,7] [============================>-------------------------------------------------------------------------] 29% est: 2s 
 plot: [3,1] [==============================>-----------------------------------------------------------------------] 31% est: 2s 
 plot: [3,2] [================================>---------------------------------------------------------------------] 33% est: 2s 
 plot: [3,3] [==================================>-------------------------------------------------------------------] 35% est: 2s 
 plot: [3,4] [====================================>-----------------------------------------------------------------] 37% est: 2s 
 plot: [3,5] [=======================================>--------------------------------------------------------------] 39% est: 2s 
 plot: [3,6] [=========================================>------------------------------------------------------------] 41% est: 2s 
 plot: [3,7] [===========================================>----------------------------------------------------------] 43% est: 2s 
 plot: [4,1] [=============================================>--------------------------------------------------------] 45% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,2] [===============================================>------------------------------------------------------] 47% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,3] [=================================================>----------------------------------------------------] 49% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [4,4] [===================================================>--------------------------------------------------] 51% est: 2s 
 plot: [4,5] [=====================================================>------------------------------------------------] 53% est: 2s 
 plot: [4,6] [=======================================================>----------------------------------------------] 55% est: 2s 
 plot: [4,7] [=========================================================>--------------------------------------------] 57% est: 2s 
 plot: [5,1] [===========================================================>------------------------------------------] 59% est: 2s 
 plot: [5,2] [=============================================================>----------------------------------------] 61% est: 2s 
 plot: [5,3] [================================================================>-------------------------------------] 63% est: 1s 
 plot: [5,4] [==================================================================>-----------------------------------] 65% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [5,5] [====================================================================>---------------------------------] 67% est: 1s 
 plot: [5,6] [======================================================================>-------------------------------] 69% est: 1s 
 plot: [5,7] [========================================================================>-----------------------------] 71% est: 1s 
 plot: [6,1] [==========================================================================>---------------------------] 73% est: 1s 
 plot: [6,2] [============================================================================>-------------------------] 76% est: 1s 
 plot: [6,3] [==============================================================================>-----------------------] 78% est: 1s 
 plot: [6,4] [================================================================================>---------------------] 80% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [6,5] [==================================================================================>-------------------] 82% est: 1s 
 plot: [6,6] [====================================================================================>-----------------] 84% est: 1s 
 plot: [6,7] [======================================================================================>---------------] 86% est: 1s 
 plot: [7,1] [=========================================================================================>------------] 88% est: 0s 
 plot: [7,2] [===========================================================================================>----------] 90% est: 0s 
 plot: [7,3] [=============================================================================================>--------] 92% est: 0s 
 plot: [7,4] [===============================================================================================>------] 94% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 plot: [7,5] [=================================================================================================>----] 96% est: 0s 
 plot: [7,6] [===================================================================================================>--] 98% est: 0s 
 plot: [7,7] [======================================================================================================]100% est: 0s 
                                                                                                                                  

Based on this, look at the following for the fourth predictor:

# Four-predictor candidate (a): mod3b's terms plus prop_single_bags
mod4a <- lm(
  formula = average_price ~ is_organic + region + month + prop_single_bags,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod4a)

# Coefficient table, residual standard error and R-squared
summary(mod4a)

Call:
lm(formula = average_price ~ is_organic + region + month + prop_single_bags, 
    data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.04602 -0.17017 -0.01395  0.15441  1.53347 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          0.965397   0.011655  82.832  < 2e-16 ***
is_organicTRUE       0.566487   0.004780 118.507  < 2e-16 ***
regionGreat Lakes   -0.090459   0.009125  -9.914  < 2e-16 ***
regionMidsouth      -0.060191   0.009186  -6.552 5.84e-11 ***
regionNortheast      0.164601   0.008664  18.999  < 2e-16 ***
regionPlains         0.007735   0.013026   0.594 0.552665    
regionSouth Central -0.289919   0.010513 -27.576  < 2e-16 ***
regionSoutheast      0.007950   0.009280   0.857 0.391667    
regionTotalUS       -0.116079   0.016903  -6.867 6.78e-12 ***
regionWest          -0.109684   0.008765 -12.514  < 2e-16 ***
month.L              0.222057   0.007137  31.112  < 2e-16 ***
month.Q             -0.136359   0.007258 -18.787  < 2e-16 ***
month.C             -0.186582   0.007246 -25.749  < 2e-16 ***
month^4             -0.077521   0.007191 -10.781  < 2e-16 ***
month^5             -0.002371   0.007291  -0.325 0.744992    
month^6              0.061071   0.007394   8.259  < 2e-16 ***
month^7             -0.004238   0.007230  -0.586 0.557812    
month^8              0.034461   0.007268   4.741 2.14e-06 ***
month^9              0.016384   0.007477   2.191 0.028444 *  
month^10            -0.026981   0.007509  -3.593 0.000328 ***
month^11             0.008597   0.007541   1.140 0.254241    
prop_single_bags     0.316043   0.010389  30.420  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2686 on 16403 degrees of freedom
Multiple R-squared:  0.5522,    Adjusted R-squared:  0.5517 
F-statistic: 963.3 on 21 and 16403 DF,  p-value: < 2.2e-16
# Compare the nested fits: mod4a is mod3b with prop_single_bags added, so
# this tests whether that extra term improves the model.
anova(mod3b, mod4a)
Analysis of Variance Table

Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + prop_single_bags
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1  16404 1250.3                                  
2  16403 1183.5  1    66.769 925.41 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Results:

# Four-predictor candidate (b): mod3b's terms plus year.
# NOTE(review): year appears to be categorical here -- the summary reports
# separate year2016 / year2017 / year2018 coefficients.
mod4b <- lm(
  formula = average_price ~ is_organic + region + month + year,
  data = avocado_train
)

# Plot the fitted model (autoplot method for lm objects; presumably the
# ggfortify residual diagnostics -- confirm which package is attached)
autoplot(mod4b)

# Coefficient table, residual standard error and R-squared
summary(mod4b)

Call:
lm(formula = average_price ~ is_organic + region + month + year, 
    data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.18315 -0.15607 -0.00215  0.14893  1.42981 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          1.206290   0.007770 155.259  < 2e-16 ***
is_organicTRUE       0.497309   0.004152 119.787  < 2e-16 ***
regionGreat Lakes   -0.133280   0.008928 -14.928  < 2e-16 ***
regionMidsouth      -0.113052   0.008925 -12.667  < 2e-16 ***
regionNortheast      0.079915   0.008110   9.854  < 2e-16 ***
regionPlains        -0.050518   0.012754  -3.961 7.49e-05 ***
regionSouth Central -0.350466   0.010212 -34.320  < 2e-16 ***
regionSoutheast     -0.056554   0.008939  -6.327 2.57e-10 ***
regionTotalUS       -0.164547   0.016664  -9.874  < 2e-16 ***
regionWest          -0.161476   0.008512 -18.970  < 2e-16 ***
month.L              0.224338   0.007451  30.109  < 2e-16 ***
month.Q             -0.168833   0.007329 -23.037  < 2e-16 ***
month.C             -0.189834   0.007180 -26.440  < 2e-16 ***
month^4             -0.071305   0.007157  -9.962  < 2e-16 ***
month^5             -0.014214   0.007268  -1.956 0.050514 .  
month^6              0.056391   0.007335   7.688 1.57e-14 ***
month^7             -0.001947   0.007167  -0.272 0.785934    
month^8              0.024216   0.007219   3.355 0.000797 ***
month^9              0.009515   0.007429   1.281 0.200263    
month^10            -0.017405   0.007448  -2.337 0.019460 *  
month^11             0.009856   0.007469   1.319 0.187026    
year2016            -0.038593   0.005299  -7.283 3.40e-13 ***
year2017             0.137662   0.005285  26.048  < 2e-16 ***
year2018             0.087480   0.009366   9.340  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.266 on 16401 degrees of freedom
Multiple R-squared:  0.561, Adjusted R-squared:  0.5603 
F-statistic: 911.1 on 23 and 16401 DF,  p-value: < 2.2e-16
# Nested-model F-test: mod3b against mod4b (mod3b plus year).
anova(mod3b, mod4b)
Analysis of Variance Table

Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + year
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1  16404 1250.3                                  
2  16401 1160.5  3    89.818 423.14 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

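Results:

* RvF looks better
* Scale-Location has more of a cone shape than mod4a
* Normal Q-Q lifts off more at both ends
* Accounts for ~ 56% of the variation
* Check for significance using anova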

# Candidate 4c: the is_organic + region + month model extended with
# `ln_total_bags`.
mod4c <- lm(
  formula = average_price ~ is_organic + region + month + ln_total_bags,
  data = avocado_train
)

# Diagnostic plots for the fitted model.
autoplot(mod4c)

# Coefficient table and goodness-of-fit figures.
summary(mod4c)

Call:
lm(formula = average_price ~ is_organic + region + month + ln_total_bags, 
    data = avocado_train)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.05316 -0.16864 -0.01189  0.15567  1.48415 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)          1.767770   0.019070  92.697  < 2e-16 ***
is_organicTRUE       0.352461   0.006423  54.876  < 2e-16 ***
regionGreat Lakes   -0.150593   0.009047 -16.646  < 2e-16 ***
regionMidsouth      -0.135357   0.009052 -14.953  < 2e-16 ***
regionNortheast      0.067748   0.008208   8.254  < 2e-16 ***
regionPlains        -0.043875   0.012898  -3.402 0.000672 ***
regionSouth Central -0.327138   0.010360 -31.577  < 2e-16 ***
regionSoutheast     -0.070486   0.009049  -7.790 7.12e-15 ***
regionTotalUS        0.020672   0.017971   1.150 0.250036    
regionWest          -0.162448   0.008607 -18.873  < 2e-16 ***
month.L              0.204078   0.007143  28.569  < 2e-16 ***
month.Q             -0.151380   0.007245 -20.893  < 2e-16 ***
month.C             -0.180654   0.007260 -24.883  < 2e-16 ***
month^4             -0.070975   0.007200  -9.858  < 2e-16 ***
month^5             -0.002157   0.007301  -0.295 0.767686    
month^6              0.060066   0.007404   8.112 5.31e-16 ***
month^7             -0.001634   0.007240  -0.226 0.821448    
month^8              0.030842   0.007276   4.239 2.26e-05 ***
month^9              0.015692   0.007487   2.096 0.036101 *  
month^10            -0.024711   0.007519  -3.287 0.001016 ** 
month^11             0.009489   0.007550   1.257 0.208846    
ln_total_bags       -0.043995   0.001484 -29.655  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.269 on 16403 degrees of freedom
Multiple R-squared:  0.551, Adjusted R-squared:  0.5505 
F-statistic: 958.7 on 21 and 16403 DF,  p-value: < 2.2e-16
# Nested-model F-test: mod3b against mod4c (mod3b plus ln_total_bags).
anova(mod3b, mod4c)
Analysis of Variance Table

Model 1: average_price ~ is_organic + region + month
Model 2: average_price ~ is_organic + region + month + ln_total_bags
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1  16404 1250.3                                  
2  16403 1186.6  1     63.62 879.42 < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

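Results:

* RvF looks ok
* Scale-Location has more of a cone shape than mod4a
* Normal Q-Q lifts off more at the end
* Accounts for ~ 55% of the variation
* Check for significance using anova
* **Choose this model**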

Test the model

# Score mod4c on the test rows, keeping only the observed price and the
# model's prediction.
predictions_test <- select(
  add_predictions(avocado_test, mod4c),
  average_price, pred
)

# Show observed vs predicted prices.
predictions_test

# Attach the squared prediction error for every test row.
predictions_test <- mutate(
  predictions_test,
  sq_err = (pred - average_price)^2
)

# Mean squared error on the test set; taking its square root gives the RMSE.
mse_test <- mean(predictions_test[["sq_err"]])
mse_test
[1] 0.08327954
# Square root of the test MSE, i.e. the test-set RMSE.
sqrt(mse_test)
[1] 0.2885819
# Score mod4c on the training rows it was fitted to, keeping only the observed
# price and the model's prediction.
predictions_train <- select(
  add_predictions(avocado_train, mod4c),
  average_price, pred
)

# Show observed vs predicted prices.
predictions_train

# Attach the squared prediction error for every training row.
predictions_train <- mutate(
  predictions_train,
  sq_err = (pred - average_price)^2
)

# Mean squared error on the training set.
mse_train <- mean(predictions_train[["sq_err"]])
mse_train
[1] 0.07224616
# Square root of the training MSE, i.e. the training-set RMSE.
sqrt(mse_train)
[1] 0.2687865

K-fold cross validation

library(caret)
Loading required package: lattice
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift
# Resampling scheme: 10-fold cross-validation, with the hold-out predictions
# from each fold saved on the fitted object.
cv_10_fold <- trainControl(
  method = "cv",
  number = 10,
  savePredictions = TRUE
)

# Linear model fitted through caret under the resampling scheme above.
# NOTE(review): this formula uses `year` (the mod4b predictors), whereas the
# test/train errors are computed with mod4c (`ln_total_bags`) -- confirm which
# model is meant to be cross-validated.
# NOTE(review): no seed is set in this chunk, so fold assignment may differ
# between runs -- confirm whether reproducibility matters here.
avo_model <- train(
  average_price ~ is_organic + region + month + year,
  data = avocado_train,
  method = "lm",
  trControl = cv_10_fold
)

# Saved hold-out predictions.
avo_model[["pred"]]

# Per-resample performance metrics.
avo_model[["resample"]]

# RMSE averaged over the resamples.
mean(avo_model[["resample"]][["RMSE"]])
[1] 0.2661194
# R-squared averaged over the cross-validation resamples.
mean(avo_model$resample$Rsquared)
[1] 0.5600208
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKQ2FsbCBpbiBsaWJyYXJpZXMKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG1vZGVscikKbGlicmFyeShsZWFwcykKbGlicmFyeShHR2FsbHkpCmxpYnJhcnkoZ2dmb3J0aWZ5KQpsaWJyYXJ5KGx1YnJpZGF0ZSkKYGBgCgpSZWFkIGluIGRhdGEgc2V0CmBgYHtyfQphdm9jYWRvIDwtIHJlYWRfY3N2KCJkYXRhL2F2b2NhZG8uY3N2IikgJT4lIGphbml0b3I6OmNsZWFuX25hbWVzKCkKYGBgCgpVc2UgYGdsaW1wc2VgCmBgYHtyfQpnbGltcHNlKGF2b2NhZG8pCmBgYAoKVXNlIGBza2ltYApgYGB7cn0KbGlicmFyeShza2ltcikKCnNraW0oYXZvY2FkbykKYGBgCgpMb29rIGF0IGZpcnN0IHNpeCByb3dzCmBgYHtyfQphdm9jYWRvICU+JSAKICBoZWFkKCkKYGBgCgoKTG9vayBhdCBkaXN0cmlidXRpb24gb2YgYXZlcmFnZSBwcmljZXMKYGBge3J9CmF2b2NhZG8gJT4lIAogIGdncGxvdChhZXMoeCA9IGF2ZXJhZ2VfcHJpY2UpKSArCiAgZ2VvbV9oaXN0b2dyYW0oKSAKYGBgCgpGaW5kIGlmIHRoZXJlIGFyZSBhbnkgYWxpYXNlcwpgYGB7cn0KYWxpYXMoYXZlcmFnZV9wcmljZSB+IC4sCiAgICAgIGRhdGEgPSBhdm9jYWRvKQpgYGAKCgpGaW5kIGFsbCB0aGUgZGlzdGluY3QgcmVnaW9ucwpgYGB7cn0KYXZvY2FkbyAlPiUgCiAgZGlzdGluY3QocmVnaW9uKSAKYGBgCgpGZWF0dXJlIEVuZ2luZWVyaW5nCgoqIGV4dHJhY3QgbW9udGggZnJvbSBgZGF0ZWAKKiBtdXRhdGUgYSBgdG90YWxfc2luZ2xlYCBjb2x1bW4gKHRvdGFsX3ZvbHVtZSA9IHRvdGFsX3NpbmdsZSArIHRvdGFsX2JhZ3MpCiogY2hhbmdlIGB5ZWFyYCB0byBhIGZhY3RvcgoqIHJlLWNvZGUgYHJlZ2lvbmAgaW50byBncm91cHMKKiBjaGFuZ2UgYHR5cGVgIHRvIGxvZ2ljYWwKCmBgYHtyfQphdm9jYWRvIDwtIGF2b2NhZG8gJT4lIAogIG11dGF0ZShtb250aCA9IG1vbnRoKGRhdGUsIGxhYmVsID0gVFJVRSwgYWJiciA9IEZBTFNFKSkgJT4lIAogIG11dGF0ZSh0b3RhbF9zaW5nbGUgPSB4NDA0NiArIHg0MjI1ICsgeDQ3NzAsCiAgICAgICAgIC5hZnRlciA9IHRvdGFsX3ZvbHVtZSkgJT4lIAogIG11dGF0ZSh5ZWFyID0gYXMuZmFjdG9yKHllYXIpKSAlPiUgCiAgbXV0YXRlKHJlZ2lvbiA9IGNhc2Vfd2hlbigKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiQWxiYW55IikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiQXRsYW50YSIpIH4gIlNvdXRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkJhbHRpbW9yZVdhc2hpbmd0b24iKSB+ICJOb3J0aGVhc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJCb2lzZSIpIH4gIldlc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJCb3N0b24iKSB+ICJOb3J0aGVhc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJCdWZmYWxvUm9jaGVzdGVyIikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVn
aW9uLCAiQ2FsaWZvcm5pYSIpIH4gIkNhbGlmb3JuaWEiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJDaGFybG90dGUiKSB+ICJNaWRzb3V0aCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkNoaWNhZ28iKSB+ICJHcmVhdCBMYWtlcyIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkNpbmNpbm5hdGlEYXl0b24iKSB+ICJHcmVhdCBMYWtlcyIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkNvbHVtYnVzIikgfiAiR3JlYXQgTGFrZXMiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJEYWxsYXNGdFdvcnRoIikgfiAiU291dGggQ2VudHJhbCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkRlbnZlciIpIH4gIldlc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJEZXRyb2l0IikgfiAiR3JlYXQgTGFrZXMiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJHcmFuZFJhcGlkcyIpIH4gIkdyZWF0IExha2VzIiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiR3JlYXRMYWtlcyIpIH4gIkdyZWF0IExha2VzIiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiSGFycmlzYnVyZ1NjcmFudG9uIikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiSG91c3RvbiIpIH4gIlNvdXRoIENlbnRyYWwiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJIYXJ0Zm9yZFNwcmluZ2ZpZWxkIikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiSW5kaWFuYXBvbGlzIikgfiAiR3JlYXQgTGFrZXMiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJKYWNrc29udmlsbGUiKSB+ICJTb3V0aGVhc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJMYXNWZWdhcyIpIH4gIldlc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJMb3NBbmdlbGVzIikgfiAiQ2FsaWZvcm5pYSIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIkxvdWlzdmlsbGUiKSB+ICJNaWRzb3V0aCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIk1pYW1pRnRMYXVkZXJkYWxlIikgfiAiU291dGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiTWlkc291dGgiKSB+ICJNaWRzb3V0aCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIk5hc2h2aWxsZSIpIH4gIk1pZHNvdXRoIiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiTmV3T3JsZWFuc01vYmlsZSIpIH4gIlNvdXRoIENlbnRyYWwiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJOZXdZb3JrIikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiTm9ydGhlYXN0IikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiTm9ydGhlcm5OZXdFbmdsYW5kIikgfiAiTm9ydGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiT3JsYW5kbyIpIH4gIlNvdXRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlBoaWxhZGVscGhpYSIpIH4gIk5vcnRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlBob2VuaXhUdWNzb24iKSB+ICJXZXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAi
UGl0dHNidXJnaCIpIH4gIk5vcnRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlBsYWlucyIpIH4gIlBsYWlucyIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlBvcnRsYW5kIikgfiAiV2VzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlJhbGVpZ2hHcmVlbnNib3JvIikgfiAiTWlkc291dGgiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJSaWNobW9uZE5vcmZvbGsiKSB+ICJNaWRzb3V0aCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlJvYW5va2UiKSB+ICJNaWRzb3V0aCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlNhY3JhbWVudG8iKSB+ICJDYWxpZm9ybmlhIiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiU2FuRGllZ28iKSB+ICJDYWxpZm9ybmlhIiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiU2FuRnJhbmNpc2NvIikgfiAiQ2FsaWZvcm5pYSIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlNlYXR0bGUiKSB+ICJXZXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiU291dGhDYXJvbGluYSIpIH4gIlNvdXRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlNvdXRoQ2VudHJhbCIpIH4gIlNvdXRoIENlbnRyYWwiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJTb3V0aGVhc3QiKSB+ICJTb3V0aGVhc3QiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJTcG9rYW5lIikgfiAiV2VzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlN0TG91aXMiKSB+ICJQbGFpbnMiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJTeXJhY3VzZSIpIH4gIk5vcnRoZWFzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIlRhbXBhIikgfiAiU291dGhlYXN0IiwKICAgIHN0cl9kZXRlY3QocmVnaW9uLCAiVG90YWxVUyIpIH4gIlRvdGFsVVMiLAogICAgc3RyX2RldGVjdChyZWdpb24sICJXZXN0IikgfiAiV2VzdCIsCiAgICBzdHJfZGV0ZWN0KHJlZ2lvbiwgIldlc3RUZXhOZXdNZXhpY28iKSB+ICJTb3V0aCBDZW50cmFsIiwKICAgIFRSVUUgfiByZWdpb24KICAgICkpICU+JSAKICBtdXRhdGUoaXNfb3JnYW5pYyA9IHR5cGUgPT0gIm9yZ2FuaWMiKQpgYGAKCgpGZWF0dXJlIEVuZ2luZWVyaW5nIGNvbnQKCiogZHJvcDoKICArIGlkCiAgKyBkYXRlCiAgKyB0b3RhbF92b2x1bWUKICArIHg0MDQ2CiAgKyB4NDIyNQogICsgeDQ3NzAKICArIHNtYWxsX2JhZ3MKICArIGxhcmdlX2JhZ3MKICArIHhfbGFyZ2VfYmFncwoKYGBge3J9CmF2b2NhZG8gPC0gYXZvY2FkbyAlPiUgCiAgc2VsZWN0KGF2ZXJhZ2VfcHJpY2UsIHRvdGFsX3NpbmdsZSwgdG90YWxfYmFncywgaXNfb3JnYW5pYywgeWVhciwgcmVnaW9uLCBtb250aCkKYGBgCgpGdXJ0aGVyIGZlYXR1cmUgZW5naW5lZXJpbmc6CgoqIGNyZWF0ZSBgbG5fdG90YWxfc2luZ2xlYAoqIGNyZWF0ZSBgbG5fdG90YWxfYmFnc2AKKiBjcmVhdGUgYHByb3Bfc2luZ2xlX2JhZ3NgCgpgYGB7cn0KYXZvY2FkbyA8LSBhdm9jYWRvICU+JSAKICBtdXRhdGUobG5fdG90YWxfc2luZ2xlID0gbG9nKHRv
dGFsX3NpbmdsZSksCiAgICAgICAgIGxuX3RvdGFsX2JhZ3MgPSBsb2codG90YWxfYmFncyArIDEpLAogICAgICAgICBwcm9wX3NpbmdsZV9iYWdzID0gdG90YWxfc2luZ2xlIC8gKHRvdGFsX3NpbmdsZSArIHRvdGFsX2JhZ3MpKQpgYGAKCgpTcGxpdCB0aGUgZGF0YSBpbnRvIHRlc3QvdHJhaW4gZGF0YSBzZXRzICg5MC8xMCkKCmBgYHtyfQpuX3JvdyA8LSBucm93KGF2b2NhZG8pCgp0ZXN0X2luZGV4IDwtIHNhbXBsZSgxOm5fcm93LCBzaXplID0gbl9yb3cgKiAwLjEpCgphdm9jYWRvX3Rlc3QgPC0gc2xpY2UoYXZvY2FkbywgdGVzdF9pbmRleCkKYXZvY2Fkb190cmFpbiA8LSBzbGljZShhdm9jYWRvLCAtdGVzdF9pbmRleCkKYGBgCgoKSW5pdGlhbCBnZ3BhaXJzCmBgYHtyfQphdm9jYWRvX3RyYWluICU+JSAKICBzZWxlY3QoYXZlcmFnZV9wcmljZSwgZXZlcnl0aGluZygpKSAlPiUgCiAgZ2dwYWlycygpCmBgYApCYXNlZCBvbiB0aGlzLCBsb29rIGF0IHRoZSBmb2xsb3dpbmcgZm9yIHRoZSBmaXJzdCBwcmVkaWN0b3I6CgoqIGBsbl90b3RhbF9iYWdgCiogYGlzX29yZ2FuaWNgCgpQbG90IGBsbl90b3RhbF9iYWdgIGhpc3RvZ3JhbQpgYGB7cn0KYXZvY2Fkb190cmFpbiAlPiUgCiAgZ2dwbG90KGFlcyh4ID0gbG5fdG90YWxfYmFncykpKwogIGdlb21faGlzdG9ncmFtKCkKYGBgCgoKUGxvdCBgaXNfb3JnYW5pY2AgYm94cGxvdApgYGB7cn0KYXZvY2Fkb190cmFpbiAlPiUgCiAgZ2dwbG90KGFlcyh4ID0gaXNfb3JnYW5pYywKICAgICAgICAgICAgIHkgPSBhdmVyYWdlX3ByaWNlKSkrCiAgZ2VvbV9ib3hwbG90KCkKYGBgCgpDcmVhdGUgZmlyc3QgbW9kZWwKCmBgYHtyfQptb2QxYSA8LSBsbShhdmVyYWdlX3ByaWNlIH4gbG5fdG90YWxfYmFncywKICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdHJhaW4pCgphdXRvcGxvdChtb2QxYSkKYGBgCgoKYGBge3J9CnN1bW1hcnkobW9kMWEpCmBgYAoKUmVzdWx0czoKCiogUnZGIHRoZXJlIHNlZW1zIHRvIGJlIGEgcGF0dGVybgoqIFNjYWxlLUxvY2F0aW9uIGlzICdtb3JlIHdyb25nJyBpbiB0aGUgbWlkZGxlIG9mIHRoZSBmaXR0ZWQgdmFsdWVzCiogTm9ybWFsIFEtUSBsaWZ0cyBvZmYgdGhlIGxpbmUgdG93YXJkcyB0aGUgaGlnaGVyIHF1YW50aWxlcwoqIEFjY291bnRzIGZvciB+IDMzJSBvZiB0aGUgdmFyaWF0aW9uCgoKYGBge3J9Cm1vZDFiIDwtIGxtKGF2ZXJhZ2VfcHJpY2UgfiBpc19vcmdhbmljLAogICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190cmFpbikKCmF1dG9wbG90KG1vZDFiKQpgYGAKCgpgYGB7cn0Kc3VtbWFyeShtb2QxYikKYGBgCgoKUmVzdWx0czoKCiogUnZGIGNhbid0IHRlbGwKKiBTY2FsZS1Mb2NhdGlvbiBjYW4ndCB0ZWxsCiogTm9ybWFsIFEtUSBsaWZ0cyBvZmYgdGhlIGxpbmUgdG93YXJkcyB0aGUgaGlnaGVyIHF1YW50aWxlcwoqIEFjY291bnRzIGZvciB+IDM3JSBvZiB0aGUgdmFyaWF0aW9uCiogKipDaG9vc2UgdGhpcyBmb3IgdGhlIGZpcnN0IG1vZGVs
KioKCkxvb2sgYXQgYW5kIHBsb3QgcmVzaWR1YWxzOgoKYGBge3J9CmF2b2NhZG9fcmVzaWQgPC0gYXZvY2Fkb190cmFpbiAlPiUgCiAgYWRkX3Jlc2lkdWFscyhtb2QxYikgJT4lIAogIHNlbGVjdCgtYyhhdmVyYWdlX3ByaWNlLCBpc19vcmdhbmljKSkKCmF2b2NhZG9fcmVzaWQgJT4lIAogIHNlbGVjdChyZXNpZCwgZXZlcnl0aGluZygpKSAlPiUgCiAgZ2dwYWlycygpCmBgYAoKQmFzZWQgb24gdGhpcywgbG9vayBhdCB0aGUgZm9sbG93aW5nIGZvciB0aGUgc2Vjb25kIHByZWRpY3RvcjoKCiogYGxuX3RvdGFsX2JhZ2AKKiBgcmVnaW9uYAoqIGBtb250aGAKClBsb3QgYHJlZ2lvbmAgYm94cGxvdApgYGB7cn0KYXZvY2Fkb190cmFpbiAlPiUgCiAgZ2dwbG90KGFlcyh4ID0gcmVnaW9uLAogICAgICAgICAgICAgeSA9IGF2ZXJhZ2VfcHJpY2UpKSsKICBnZW9tX2JveHBsb3QoKQpgYGAKCgpQbG90IHRoZSBgbW9udGhgIGJveHBsb3QKYGBge3J9CmF2b2NhZG9fdHJhaW4gJT4lIAogIGdncGxvdChhZXMoeCA9IG1vbnRoLAogICAgICAgICAgICAgeSA9IGF2ZXJhZ2VfcHJpY2UpKSsKICBnZW9tX2JveHBsb3QoKQpgYGAKCgpgYGB7cn0KbW9kMmEgPC0gbG0oYXZlcmFnZV9wcmljZSB+IGlzX29yZ2FuaWMgKyBsbl90b3RhbF9iYWdzLAogICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190cmFpbikKCmF1dG9wbG90KG1vZDJhKQpgYGAKCgpgYGB7cn0Kc3VtbWFyeShtb2QyYSkKYGBgCgpSZXN1bHRzOgoKKiBSdkYgdGhlcmUgc2VlbXMgdG8gYmUgYSBwYXR0ZXJuLCB0aGVyZSdzIGEgZ2FwIGluIHRoZSBmaXR0ZWQgdmFsdWVzCiogU2NhbGUtTG9jYXRpb24gaXMgJ21vcmUgd3JvbmcnIHRoZSBoaWdoZXIgdGhlIGZpdHRlZCB2YWx1ZXMsIHNsaWdodCBjb25lIHNoYXBlCiogTm9ybWFsIFEtUSBsaWZ0cyBvZmYgdGhlIGxpbmUgdG93YXJkcyB0aGUgaGlnaGVyIHF1YW50aWxlcwoqIEFjY291bnRzIGZvciB+IDQyJSBvZiB0aGUgdmFyaWF0aW9uCgpgYGB7cn0KbW9kMmIgPC0gbG0oYXZlcmFnZV9wcmljZSB+IGlzX29yZ2FuaWMgKyByZWdpb24sCiAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RyYWluKQoKYXV0b3Bsb3QobW9kMmIpCmBgYAoKCmBgYHtyfQpzdW1tYXJ5KG1vZDJiKQpgYGAKClJlc3VsdHM6CgoqIFJ2RiBjYW4ndCB0ZWxsCiogU2NhbGUtTG9jYXRpb24gY2FuJ3QgdGVsbAoqIE5vcm1hbCBRLVEgbGlmdHMgb2ZmIHRoZSBsaW5lIGF0IHRoZSBiZWdpbm5pbmcgYW5kIHRvd2FyZHMgdGhlIGhpZ2hlciBxdWFudGlsZXMKKiBBY2NvdW50cyBmb3IgfiA0NyUgb2YgdGhlIHZhcmlhdGlvbgoqICoqQ2hvb3NlIHRoaXMgcmVzdWx0KioKCmBgYHtyfQptb2QyYyA8LSBsbShhdmVyYWdlX3ByaWNlIH4gaXNfb3JnYW5pYyArIG1vbnRoLAogICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190cmFpbikKCmF1dG9wbG90KG1vZDJjKQpgYGAKCgpgYGB7cn0Kc3VtbWFyeShtb2QyYykKYGBgCgpSZXN1bHRzOgoKKiBSdkYgY2FuJ3QgdGVsbAoqIFNj
YWxlLUxvY2F0aW9uIGNhbid0IHRlbGwKKiBOb3JtYWwgUS1RIGxpZnRzIG9mZiB0aGUgbGluZSB0b3dhcmRzIHRoZSBoaWdoZXIgcXVhbnRpbGVzCiogQWNjb3VudHMgZm9yIH4gNDQlIG9mIHRoZSB2YXJpYXRpb24KClBsb3QgdGhlIHJlc2lkdWFscwoKYGBge3J9CmF2b2NhZG9fcmVzaWQgPC0gYXZvY2Fkb190cmFpbiAlPiUgCiAgYWRkX3Jlc2lkdWFscyhtb2QyYikgJT4lIAogIHNlbGVjdCgtYyhhdmVyYWdlX3ByaWNlLCBpc19vcmdhbmljLCByZWdpb24pKQoKYXZvY2Fkb19yZXNpZCAlPiUgCiAgc2VsZWN0KHJlc2lkLCBldmVyeXRoaW5nKCkpICU+JSAKICBnZ3BhaXJzKCkKYGBgCgpCYXNlZCBvbiB0aGlzLCBsb29rIGF0IHRoZSBmb2xsb3dpbmcgZm9yIHRoZSBzZWNvbmQgcHJlZGljdG9yOgoKKiBgcHJvcF9zaW5nbGVfYmFnc2AKKiBgbW9udGhgCgoKYGBge3J9Cm1vZDNhIDwtIGxtKGF2ZXJhZ2VfcHJpY2UgfiBpc19vcmdhbmljICsgcmVnaW9uICsgcHJvcF9zaW5nbGVfYmFncywKICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdHJhaW4pCgphdXRvcGxvdChtb2QzYSkKYGBgCgoKYGBge3J9CnN1bW1hcnkobW9kM2EpCmBgYAoKUmVzdWx0czoKCiogUnZGIHN0aWxsIHNlZW0gdG8gYmUgZ2V0dGluZyBtb3JlIGVycm9ycyBmb3IgYSBoaWdoZXIgZml0dGVkIHZhbHVlCiogU2NhbGUtTG9jYXRpb24gbG9va3MgYmV0dGVyIGJ1dCBzdGlsbCAnbW9yZSB3cm9uZycgdGhlIGhpZ2hlciB0aGUgZml0dGVkIHZhbHVlcwoqIE5vcm1hbCBRLVEgbGlmdHMgb2ZmIHRoZSBsaW5lIHRvd2FyZHMgdGhlIGhpZ2hlciBxdWFudGlsZXMKKiBBY2NvdW50cyBmb3IgfiA0OSUgb2YgdGhlIHZhcmlhdGlvbgoKYGBge3J9Cm1vZDNiIDwtIGxtKGF2ZXJhZ2VfcHJpY2UgfiBpc19vcmdhbmljICsgcmVnaW9uICsgbW9udGgsCiAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RyYWluKQoKYXV0b3Bsb3QobW9kM2IpCmBgYAoKCmBgYHtyfQpzdW1tYXJ5KG1vZDNiKQpgYGAKCgpgYGB7cn0KYW5vdmEobW9kMmIsIG1vZDNiKQpgYGAKClJlc3VsdHM6CgoqIFJ2RiBsb29rcyBiZXR0ZXIsIGJ1dCBoaWdoZXIgZXJyb3JzIGZvciBoaWdoZXIgZml0dGVkIHZhbHVlcwoqIFNjYWxlLUxvY2F0aW9uIGVycm9ycyBiZWNvbWUgbW9yZSBzcHJlYWQgb3V0LCBzbGlnaHQgY29uZSBzaGFwZQoqIE5vcm1hbCBRLVEgc3RhcnRpbmcgdG8gbGlmdCBvZmYgYXQgYm90aCBlbmRzIG9mIHRoZSBjaGFydAoqIEFjY291bnRzIGZvciB+IDUzJSBvZiB0aGUgdmFyaWF0aW9uCiogQ2hlY2sgdGhhdCBhbGwgcmVzdWx0cyBjYW4gYmUgaW5jbHVkZWQgdXNpbmcgYW5vdmEKKiAqKkNob29zZSB0aGlzIGFzIHRoZSBwcmVkaWN0b3IqKgoKUGxvdCB0aGUgcmVzaWR1YWxzCgpgYGB7cn0KYXZvY2Fkb19yZXNpZCA8LSBhdm9jYWRvX3RyYWluICU+JSAKICBhZGRfcmVzaWR1YWxzKG1vZDNiKSAlPiUgCiAgc2VsZWN0KC1jKGF2ZXJhZ2VfcHJpY2UsIGlzX29yZ2FuaWMsIHJlZ2lvbiwgbW9u
dGgpKQoKYXZvY2Fkb19yZXNpZCAlPiUgCiAgc2VsZWN0KHJlc2lkLCBldmVyeXRoaW5nKCkpICU+JSAKICBnZ3BhaXJzKCkKYGBgCgpCYXNlZCBvbiB0aGlzLCBsb29rIGF0IHRoZSBmb2xsb3dpbmcgZm9yIHRoZSBmb3VydGggcHJlZGljdG9yOgoKKiBgcHJvcF9zaW5nbGVfYmFnc2AKKiBgeWVhcmAKKiBgbG5fdG90YWxfYmFnc2AKCgpgYGB7cn0KbW9kNGEgPC0gbG0oYXZlcmFnZV9wcmljZSB+IGlzX29yZ2FuaWMgKyByZWdpb24gKyBtb250aCArIHByb3Bfc2luZ2xlX2JhZ3MsCiAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RyYWluKQoKYXV0b3Bsb3QobW9kNGEpCmBgYAoKCmBgYHtyfQpzdW1tYXJ5KG1vZDRhKQpgYGAKCgpgYGB7cn0KYW5vdmEobW9kM2IsIG1vZDRhKQpgYGAKClJlc3VsdHM6CgoqIFJ2RiBsb29rcyBiZXR0ZXIKKiBTY2FsZS1Mb2NhdGlvbiBzdGlsbCBoYXMgYSBzbGlnaHQgY29uZSBzaGFwZQoqIE5vcm1hbCBRLVEgbGlmdHMgb2ZmIG1vcmUgYXQgdGhlIHRvcAoqIEFjY291bnRzIGZvciB+IDU1JSBvZiB0aGUgdmFyaWF0aW9uCiogQ2hlY2sgZm9yIHNpZ25pZmljYW5jZSB1c2luZyBhbm92YQoKCmBgYHtyfQptb2Q0YiA8LSBsbShhdmVyYWdlX3ByaWNlIH4gaXNfb3JnYW5pYyArIHJlZ2lvbiArIG1vbnRoICsgeWVhciwKICAgICAgICAgICAgZGF0YSA9IGF2b2NhZG9fdHJhaW4pCgphdXRvcGxvdChtb2Q0YikKYGBgCgoKYGBge3J9CnN1bW1hcnkobW9kNGIpCmBgYAoKCmBgYHtyfQphbm92YShtb2QzYiwgbW9kNGIpCmBgYAoKUmVzdWx0czoKCiogUnZGIGxvb2tzIGJldHRlcgoqIFNjYWxlLUxvY2F0aW9uIGhhcyBtb3JlIG9mIGEgY3BuZSBzaGFwZSB0aGFuIG1vZDRhCiogTm9ybWFsIFEtUSBsaWZ0cyBvZmYgbW9yZSBhdCB0aGUgYm90aCBlbmRzCiogQWNjb3VudHMgZm9yIH4gNTYlIG9mIHRoZSB2YXJpYXRpb24KKiBDaGVjayBmb3Igc2lnbmlmaWNhbmNlIHVzaW5nIGFub3ZhCgoKYGBge3J9Cm1vZDRjIDwtIGxtKGF2ZXJhZ2VfcHJpY2UgfiBpc19vcmdhbmljICsgcmVnaW9uICsgbW9udGggKyBsbl90b3RhbF9iYWdzLAogICAgICAgICAgICBkYXRhID0gYXZvY2Fkb190cmFpbikKCmF1dG9wbG90KG1vZDRjKQpgYGAKCgpgYGB7cn0Kc3VtbWFyeShtb2Q0YykKYGBgCgoKYGBge3J9CmFub3ZhKG1vZDNiLCBtb2Q0YykKYGBgCgoKUmVzdWx0czoKCiogUnZGIGxvb2tzIG9rCiogU2NhbGUtTG9jYXRpb24gaGFzIG1vcmUgb2YgYSBjb25lIHNoYXBlIHRoYW4gbW9kNGEKKiBOb3JtYWwgUS1RIGxpZnRzIG9mZiBtb3JlIGF0IHRoZSAgZW5kCiogQWNjb3VudHMgZm9yIH4gNTUlIG9mIHRoZSB2YXJpYXRpb24KKiBDaGVjayBmb3Igc2lnbmlmaWNhbmNlIHVzaW5nIGFub3ZhCiogKipDaG9vc2UgdGhpcyBtb2RlbCoqCgoKVGVzdCB0aGUgbW9kZWwKCmBgYHtyfQpwcmVkaWN0aW9uc190ZXN0IDwtIGF2b2NhZG9fdGVzdCAlPiUgCiAgYWRkX3ByZWRpY3Rpb25zKG1vZDRjKSAlPiUgCiAgc2VsZWN0KGF2ZXJhZ2Vf
cHJpY2UsIHByZWQpCgpwcmVkaWN0aW9uc190ZXN0CmBgYAoKYGBge3J9CnByZWRpY3Rpb25zX3Rlc3QgPC0gcHJlZGljdGlvbnNfdGVzdCAlPiUgCiAgbXV0YXRlKHNxX2VyciA9IChwcmVkIC0gYXZlcmFnZV9wcmljZSleMikKCm1zZV90ZXN0IDwtIG1lYW4ocHJlZGljdGlvbnNfdGVzdCRzcV9lcnIpCm1zZV90ZXN0ICMgbm9ybWFsbHkgdGhpcyB3b3VsZCBiZSBzcXJ0J2QgLT4gUk1TRQoKc3FydChtc2VfdGVzdCkKYGBgCgoKYGBge3J9CnByZWRpY3Rpb25zX3RyYWluIDwtIGF2b2NhZG9fdHJhaW4gJT4lIAogIGFkZF9wcmVkaWN0aW9ucyhtb2Q0YykgJT4lIAogIHNlbGVjdChhdmVyYWdlX3ByaWNlLCBwcmVkKQoKcHJlZGljdGlvbnNfdHJhaW4KYGBgCgoKYGBge3J9CnByZWRpY3Rpb25zX3RyYWluIDwtIHByZWRpY3Rpb25zX3RyYWluICU+JSAKICBtdXRhdGUoc3FfZXJyID0gKHByZWQgLSBhdmVyYWdlX3ByaWNlKSBeIDIpCgptc2VfdHJhaW4gPC0gbWVhbihwcmVkaWN0aW9uc190cmFpbiRzcV9lcnIpCm1zZV90cmFpbgoKc3FydChtc2VfdHJhaW4pCmBgYAoKCgpLLWZvbGQgY3Jvc3MgdmFsaWRhdGlvbgoKYGBge3J9CmxpYnJhcnkoY2FyZXQpCmBgYAoKCmBgYHtyfQpjdl8xMF9mb2xkIDwtIHRyYWluQ29udHJvbChtZXRob2QgPSAiY3YiLAogICAgICAgICAgICAgICAgICAgICAgICAgICBudW1iZXIgPSAxMCwKICAgICAgICAgICAgICAgICAgICAgICAgICAgc2F2ZVByZWRpY3Rpb25zID0gVFJVRSkKCmF2b19tb2RlbCA8LSB0cmFpbihhdmVyYWdlX3ByaWNlIH4gaXNfb3JnYW5pYyArIHJlZ2lvbiArIG1vbnRoICsgeWVhciwKICAgICAgICAgICAgICAgICAgIGRhdGEgPSBhdm9jYWRvX3RyYWluLAogICAgICAgICAgICAgICAgICAgdHJDb250cm9sID0gY3ZfMTBfZm9sZCwKICAgICAgICAgICAgICAgICAgIG1ldGhvZCA9ICJsbSIpCmBgYAoKCmBgYHtyfQphdm9fbW9kZWwkcHJlZApgYGAKCgpgYGB7cn0KYXZvX21vZGVsJHJlc2FtcGxlCmBgYAoKCmBgYHtyfQptZWFuKGF2b19tb2RlbCRyZXNhbXBsZSRSTVNFKQpgYGAKCgpgYGB7cn0KbWVhbihhdm9fbW9kZWwkcmVzYW1wbGUkUnNxdWFyZWQpCmBgYAoK